library(tidyverse)
## ─ Attaching packages ───────────────── tidyverse 1.2.1 ─
## ✔ ggplot2 2.2.1     ✔ purrr   0.2.4
## ✔ tibble  1.3.4     ✔ dplyr   0.7.4
## ✔ tidyr   0.7.2     ✔ stringr 1.2.0
## ✔ readr   1.1.1     ✔ forcats 0.2.0
## ─ Conflicts ─────────────────── tidyverse_conflicts() ─
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(GGally)
## 
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
## 
##     nasa
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine

Ex. 7.3.4

1

Explore the distribution of each of the x, y, and z variables in diamonds. What do you learn? Think about a diamond and how you might decide which dimension is the length, width, and depth.

# Scatterplot matrix of the three dimension columns (x, y, z) of `diamonds`.
diamonds %>%
  select(x, y, z) %>%
  ggpairs()

ダイアモンドは円形なので一番相関の大きいx,yがlength,widthでしょう.

2

Explore the distribution of price. Do you discover anything unusual or surprising? (Hint: Carefully think about the binwidth and make sure you try a wide range of values.)

# Two stacked price histograms with 50-wide bins: the full range on top and
# a zoom on the 1000-2000 window below.
grid.arrange(
  ggplot(data = diamonds, mapping = aes(x = price)) +
    geom_histogram(binwidth = 50),
  ggplot(data = diamonds, mapping = aes(x = price)) +
    geom_histogram(binwidth = 50) +
    coord_cartesian(xlim = c(1000, 2000))
)

価格1500にデータが存在しない!不思議!

3

How many diamonds are 0.99 carat? How many are 1 carat? What do you think is the cause of the difference?

# Count the diamonds at 0.99 carat and at 1 carat (first element of dim() is
# the row count). `carat` is a double, so it is matched with dplyr::near()
# (tolerance-based) rather than exact `==`.
print(dim(diamonds %>% filter(near(carat, 0.99))))
## [1] 23 10
print(dim(diamonds %>% filter(near(carat, 1))))
## [1] 1558   10

ちゃんとキリのいいcaratで作りたいんちゃう.

4

Compare and contrast coord_cartesian() vs xlim() or ylim() when zooming in on a histogram. What happens if you leave binwidth unset? What happens if you try and zoom so only half a bar shows?

xlim() vs coord_cartesian()

# Top: coord_cartesian() only zooms the view.
# Bottom: xlim() sets the scale limits, so prices outside 1000-2000 are
# dropped before binning (see the warning below).
grid.arrange(
  ggplot(data = diamonds, mapping = aes(x = price)) +
    geom_histogram(binwidth = 50) +
    coord_cartesian(xlim = c(1000, 2000)),
  ggplot(data = diamonds, mapping = aes(x = price)) +
    geom_histogram(binwidth = 50) +
    xlim(c(1000, 2000))
)
## Warning: Removed 44232 rows containing non-finite values (stat_bin).

ylim() vs coord_cartesian()

# Three default-binned price histograms: unrestricted, zoomed to counts
# 0-100 with coord_cartesian(), and limited to counts 0-100 with ylim().
grid.arrange(
  ggplot(data = diamonds, mapping = aes(x = price)) +
    geom_histogram(),
  ggplot(data = diamonds, mapping = aes(x = price)) +
    geom_histogram() +
    coord_cartesian(ylim = c(0, 100)),
  ggplot(data = diamonds, mapping = aes(x = price)) +
    geom_histogram() +
    ylim(c(0, 100))
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 29 rows containing missing values (geom_bar).

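この結果は一見かなり謎…一つ目,二つ目は妥当.三つ目でバーが消えるのは,ylim()がスケールの範囲外の値(countが100を超えるビン)をNAに置き換えて捨ててしまうから(警告の「Removed 29 rows」がそれ).coord_cartesian()は集計後に表示範囲をズームするだけなのでバーは残る.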

Ex. 7.4.1

1

What happens to missing values in a histogram? What happens to missing values in a bar chart? Why is there a difference?

まず,各列の欠測の数え方

# Number of missing values in each column. is.na() on a data frame returns a
# logical matrix, so colSums() counts the TRUEs per column directly.
colSums(is.na(diamonds))
##   carat     cut   color clarity   depth   table   price       x       y 
##       0       0       0       0       0       0       0       0       0 
##       z 
##       0
library(nycflights13)
colSums(is.na(flights))
##           year          month            day       dep_time sched_dep_time 
##              0              0              0           8255              0 
##      dep_delay       arr_time sched_arr_time      arr_delay        carrier 
##           8255           8713              0           9430              0 
##         flight        tailnum         origin           dest       air_time 
##              0           2512              0              0           9430 
##       distance           hour         minute      time_hour 
##              0              0              0              0

geom_bar用にmissingあり離散列が欲しいがない(tailnumはユニークidが多い)ので無理やり作る

# `delay_type` is NA wherever `dep_delay` is NA, which yields a discrete
# column with missing values for geom_bar().
flights2 <- mutate(
  flights,
  delay_type = if_else(dep_delay > 0, "late", "fast")
)
# p1: histogram of a continuous column that contains NAs.
p1 <- ggplot(data = flights, mapping = aes(x = air_time)) +
  geom_histogram()
# p2: bar chart of the discrete column that contains NAs.
p2 <- ggplot(data = flights2, mapping = aes(x = delay_type)) +
  geom_bar()
grid.arrange(p1, p2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 9430 rows containing non-finite values (stat_bin).

つまりbar plotはNAも個別のラベルとしてカウントする!!

2

What does na.rm = TRUE do in mean() and sum()?

# Mean air time per destination WITHOUT na.rm: a single NA in a group makes
# that group's mean NA (this is the behaviour being demonstrated).
flights %>%
  group_by(dest) %>%
  summarise(avg = mean(air_time))
## # A tibble: 105 x 2
##     dest      avg
##    <chr>    <dbl>
##  1   ABQ 249.1693
##  2   ACK       NA
##  3   ALB       NA
##  4   ANC 413.1250
##  5   ATL       NA
##  6   AUS       NA
##  7   AVL       NA
##  8   BDL       NA
##  9   BGR       NA
## 10   BHM       NA
## # ... with 95 more rows
# Mean air time per destination with NAs dropped before averaging.
flights %>%
  group_by(dest) %>%
  summarise(avg = mean(air_time, na.rm = TRUE))
## # A tibble: 105 x 2
##     dest       avg
##    <chr>     <dbl>
##  1   ABQ 249.16929
##  2   ACK  42.06818
##  3   ALB  31.78708
##  4   ANC 413.12500
##  5   ATL 112.93045
##  6   AUS 212.72791
##  7   AVL  89.88889
##  8   BDL  25.46602
##  9   BGR  54.11732
## 10   BHM 122.77695
## # ... with 95 more rows

NAが一つでもあると平均や合計の結果が全てNAになってしまうのでna.rm=TRUEで無視しましょう.

Ex. 7.5.1.1

1

Use what you’ve learned to improve the visualisation of the departure times of cancelled vs. non-cancelled flights.

# Flag flights with no recorded departure time as cancelled and convert the
# scheduled departure time from HHMM to decimal hours. transmute() keeps
# only the columns listed here, in this order.
cancelled <- nycflights13::flights %>%
  transmute(
    cancelled = is.na(dep_time),
    sched_hour = sched_dep_time %/% 100,
    sched_min = sched_dep_time %% 100,
    # Overwrites the HHMM value, so it must come after the two lines above.
    sched_dep_time = sched_hour + sched_min / 60
  )
cancelled
## # A tibble: 336,776 x 4
##    cancelled sched_hour sched_min sched_dep_time
##        <lgl>      <dbl>     <dbl>          <dbl>
##  1     FALSE          5        15       5.250000
##  2     FALSE          5        29       5.483333
##  3     FALSE          5        40       5.666667
##  4     FALSE          5        45       5.750000
##  5     FALSE          6         0       6.000000
##  6     FALSE          5        58       5.966667
##  7     FALSE          6         0       6.000000
##  8     FALSE          6         0       6.000000
##  9     FALSE          6         0       6.000000
## 10     FALSE          6         0       6.000000
## # ... with 336,766 more rows
# p1: frequency polygons of raw counts per quarter-hour bin.
p1 <- ggplot(data = cancelled, mapping = aes(x = sched_dep_time)) +
  geom_freqpoly(mapping = aes(color = cancelled), binwidth = 1 / 4)
# p2: same, but on the density scale instead of counts.
p2 <- ggplot(
  data = cancelled,
  mapping = aes(x = sched_dep_time, y = ..density..)
) +
  geom_freqpoly(mapping = aes(color = cancelled), binwidth = 1 / 4)
# p3: horizontal boxplots of scheduled departure time by cancellation flag.
p3 <- ggplot(
  data = cancelled,
  mapping = aes(x = cancelled, y = sched_dep_time)
) +
  geom_boxplot() +
  coord_flip()
grid.arrange(p1, p2, p3)

2

What variable in the diamonds dataset is most important for predicting the price of a diamond? How is that variable correlated with cut? Why does the combination of those two relationships lead to lower quality diamonds being more expensive?

#

ここちょっとむずいので後

3

Install the ggstance package, and create a horizontal boxplot. How does this compare to using coord_flip()?

library(ggstance)
## 
## Attaching package: 'ggstance'
## The following objects are masked from 'package:ggplot2':
## 
##     geom_errorbarh, GeomErrorbarh
# Horizontal boxplots via ggstance::geom_boxploth(): the continuous variable
# goes on x and the categorical one on y (no coord_flip() needed).
p1 <- ggplot(data = mpg, mapping = aes(x = hwy, y = class)) +
  geom_boxploth()
p2 <- ggplot(data = diamonds, mapping = aes(x = price, y = color)) +
  geom_boxploth()
p3 <- ggplot(
  data = diamonds,
  mapping = aes(x = price, y = cut, fill = color)
) +
  geom_boxploth()
grid.arrange(p1, p2, p3)

まじで,coord_flip()使えばいいと思う.x, yの指定が逆になるので注意.

4

One problem with boxplots is that they were developed in an era of much smaller datasets and tend to display a prohibitively large number of “outlying values”. One approach to remedy this problem is the letter value plot. Install the lvplot package, and try using geom_lv() to display the distribution of price vs cut. What do you learn? How do you interpret the plots?

library(lvplot)
# Price by cut: a classic boxplot (top) against a letter-value plot (bottom).
grid.arrange(
  ggplot(data = diamonds) +
    geom_boxplot(mapping = aes(x = cut, y = price)),
  ggplot(data = diamonds) +
    geom_lv(mapping = aes(x = cut, y = price, fill = ..LV..)) +
    scale_fill_lv()
)

geom_lvちょっとよくわからんな...

5

Compare and contrast geom_violin() with a facetted geom_histogram(), or a coloured geom_freqpoly(). What are the pros and cons of each method?

6

If you have a small dataset, it’s sometimes useful to use geom_jitter() to see the relationship between a continuous and categorical variable. The ggbeeswarm package provides a number of methods similar to geom_jitter(). List them and briefly describe what each one does.

Ex. 7.5.2.1

1

How could you rescale the count dataset above to more clearly show the distribution of cut within colour, or colour within cut?

scale_fill_gradient()をいじる.特に対数変換がいい気がする.

# Heatmaps of the color x cut counts: linear fill scale on top, log-scaled
# fill below. geom_tile() already uses position = "identity" by default.
grid.arrange(
  diamonds %>%
    count(color, cut) %>%
    ggplot(mapping = aes(x = color, y = cut)) +
    geom_tile(mapping = aes(fill = n)) +
    scale_fill_gradient(low = "white", high = "steelblue") +
    labs(title = "w/o log scaling"),
  diamonds %>%
    count(color, cut) %>%
    ggplot(mapping = aes(x = color, y = cut)) +
    geom_tile(mapping = aes(fill = n)) +
    scale_fill_gradient(low = "white", high = "steelblue", trans = "log") +
    labs(title = "with log scaling")
)

2

Use geom_tile() together with dplyr to explore how average flight delays vary by destination and month of year. What makes the plot difficult to read? How could you improve it?

library(nycflights13)
# Average departure delay per destination and month, drawn as a heatmap.
# `dep_delay` contains NAs, so na.rm = TRUE is required: without it every
# destination/month group holding at least one NA gets an NA average.
flights %>%
  group_by(dest, month) %>%
  summarise(avdelay = mean(dep_delay, na.rm = TRUE)) %>%
  ungroup() %>%
  ggplot(aes(x = dest, y = month)) +
  geom_tile(aes(fill = avdelay)) +
  coord_flip()

3

Why is it slightly better to use aes(x = color, y = cut) rather than aes(x = cut, y = color) in the example above?

# Same log-scaled count heatmap in both orientations: color on x / cut on y
# (top) and cut on x / color on y (bottom).
grid.arrange(
  diamonds %>%
    count(color, cut) %>%
    ggplot(mapping = aes(x = color, y = cut)) +
    geom_tile(mapping = aes(fill = n)) +
    scale_fill_gradient(trans = "log"),
  diamonds %>%
    count(color, cut) %>%
    ggplot(mapping = aes(x = cut, y = color)) +
    geom_tile(mapping = aes(fill = n)) +
    scale_fill_gradient(trans = "log")
)

なんだろう...

  • cutの方がvalueの文字列が長いのでy軸に置いといたほうがつぶれにくい.

くらいじゃないかな?

Ex. 7.5.3.1

1

Instead of summarising the conditional distribution with a boxplot, you could use a frequency polygon. What do you need to consider when using cut_width() vs cut_number()? How does that impact a visualisation of the 2d distribution of carat and price?

# Boxplots of price within 20 equal-count carat bins.
ggplot(data = diamonds, mapping = aes(x = carat, y = price)) +
  geom_boxplot(mapping = aes(group = cut_number(carat, 20)))

# Price frequency polygons, one per equal-count carat bin (10 bins).
ggplot(data = diamonds, mapping = aes(x = price)) +
  geom_freqpoly(mapping = aes(color = cut_number(carat, 10)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Price frequency polygons, one per fixed-width (0.5 carat) bin.
ggplot(data = diamonds, mapping = aes(x = price)) +
  geom_freqpoly(mapping = aes(color = cut_width(carat, 0.5)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

  • まぁ2番目みたいに固定幅ではなくbinの個数で指定した方がいいかもな

2

Visualise the distribution of carat, partitioned by price.

# 2-d bin counts of carat against price. Counts are strictly positive, so a
# sequential fill on a log scale is used (same palette as the tile plots in
# this file); the diverging scale_fill_gradient2() is centred on
# midpoint = 0, which is not meaningful for log-scaled counts.
ggplot(diamonds, aes(x = price, y = carat)) +
  geom_bin2d() +
  scale_fill_gradient(low = "white", high = "steelblue", trans = "log")

3

How does the price distribution of very large diamonds compare to small diamonds. Is it as you expect, or does it surprise you?

# Flag diamonds under 3 carats and move the flag to the first column.
diamonds4 <- diamonds %>%
  mutate(smaller = carat < 3) %>%
  select(smaller, everything())
# Price against carat, coloured by the flag, with one smoother per group.
ggplot(data = diamonds4, mapping = aes(x = carat, y = price)) +
  geom_point(mapping = aes(color = smaller), alpha = 0.5) +
  geom_smooth(mapping = aes(linetype = smaller), se = FALSE)
## `geom_smooth()` using method = 'gam'

4

Combine two of the techniques you’ve learned to visualise the combined distribution of cut, carat, and price.

# Scatterplots of price against carat, one facet per cut.
ggplot(data = diamonds) +
  geom_point(mapping = aes(x = carat, y = price), alpha = 1 / 50) +
  facet_wrap(~cut, nrow = 2)

# Same view with hexagonal bins; lighter to draw than individual points.
ggplot(data = diamonds) +
  geom_hex(mapping = aes(x = carat, y = price)) +
  facet_wrap(~cut, nrow = 2)

# Single panel: points plus one smoother per cut.
ggplot(data = diamonds, mapping = aes(x = carat, y = price)) +
  geom_point(mapping = aes(color = cut), alpha = 0.1) +
  geom_smooth(mapping = aes(color = cut), se = FALSE)
## `geom_smooth()` using method = 'gam'

# At equal carat, a worse cut is indeed cheaper.

# The plot below is distorted by confounding (carat is not controlled for).
ggplot(data = diamonds, mapping = aes(x = cut, y = price)) +
  geom_boxplot()

  • 最後のグラフではcutが悪いほうが価格が安いという直感に反する結果になっている
  • しかし,一つ前のグラフをみると同じcaratならFair(最も悪いカット)がもっとも安いという結果になり直感に合う.

5

Two dimensional plots reveal outliers that are not visible in one dimensional plots. For example, some points in the plot below have an unusual combination of x and y values, which makes the points outliers even though their x and y values appear normal when examined separately.

# Scatterplot matrix of x and y: marginal densities plus the joint scatter.
diamonds %>%
  select(x, y) %>%
  ggpairs()

2d plotでは見えている外れ値が,各軸に射影すると(1次元の分布では)外れ値に見えなくなるね.